Forest Loss and the Rise of Malaria Incidence Rate in Sub-Saharan Africa

Data Preparation

setwd("~/STAT 482")

library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)
library(stringr)
library(openxlsx)
library(patchwork)
library(purrr)
library(GGally)
library(plotly)
library(scales)
library(ggcorrplot)
library(nlme)
library(readxl)
library(ggrepel)


# turn off scientific notation format output
options(scipen = 999)

# The clean_char function takes in a character and returns a number without the range in brackets and without spaces in the mean. 
# For example, "193 232 [149 000-245 000]" becomes 193232
clean_char = function(text){
  return(as.numeric(gsub(" ", "", sub("\\[.*", "", text))))
}

Population by Country

# Import Subsaharan African countries list
countries = read_excel("~/STAT 482/subsaharan_countries.xlsx", sheet = "Countries", col_names =  TRUE)

population = read_excel("~/STAT 482/Population.xls", sheet = "Data", col_names =  TRUE)

# filter to get only sub-Saharan African countries
population = population[population$`Country Name` %in% countries$country_name,]

# keep only years 2000-2020, remove Indicator Name and Indicator Code cols.
to_remove = as.character(c(1960:1999,2021:2023, "Indicator Code", "Indicator Name"))
population = population[,!(colnames(population) %in% to_remove)]

Forest Cover Data

# import forest cover data by country and the sub-Saharan countries list
forest = read.csv("forest_coverage_data_by_country.xls - Data.csv")

# take out the first two rows of forest, which contain data we don't need
forest = forest[-c(1,2),]

# make the first row of the data the headers
colnames(forest) = as.character(unlist(forest[1,]))
forest = forest[-1,]

# filter to get only sub-Saharan African countries
forest = forest[forest$`Country Name` %in% countries$country_name,]

# keep only years 2000-2020, remove Indicator Name and Indicator Code cols.
forest = forest[,!(colnames(forest) %in% to_remove)]

Malaria Case Data

# import malaria case number data by country
malaria = read_xlsx("estimated_malaria_cases.xlsx")

# filter to get only Sub-saharan African countries
malaria = malaria[malaria$`Country Name` %in% countries$country_name,]


# loop through all numeric columns in malaria, applying clean_char function
col_mal = colnames(malaria) # list of column names of malaria data
for (i in 2:length(col_mal)){
  malaria[col_mal[i]] = sapply(malaria[,i], clean_char)}


# remove column for year 2021
malaria = malaria[,!(colnames(malaria) %in% to_remove)]

# create table with country name and country code for joining
country_name_code = forest[,c(1:2)] 

# join the country code to the malaria dataset by the country name
malaria = left_join(malaria, country_name_code, by = "Country Name")
Normalize Malaria Case Data by Population per Country
# Reshape to long format
malaria_long = malaria %>%
  pivot_longer(cols = starts_with("20"), names_to = "year", values_to = "cases")

population_long = population %>%
  pivot_longer(cols = starts_with("20"), names_to = "year", values_to = "population")

# Join the data
combined_data = malaria_long %>%
  left_join(population_long, by = c("Country Name", "Country Code", "year"))

# Normalize the data
normalized_data = combined_data %>%
  mutate(normalized_cases = cases / population)

# Reshape back to wide format
malaria = normalized_data %>%
  select("Country Name", "Country Code", year, normalized_cases) %>%
  pivot_wider(names_from = year, values_from = normalized_cases)

Rainfall Data

rainfall = read_excel("~/STAT 482/rainfall_data.xls", sheet = "Data", col_names =  TRUE) 

rainfall = rainfall[rainfall$`Country Name` %in% countries$country_name,]
rainfall = rainfall[,!(colnames(rainfall) %in% to_remove)]

GDP per Capita Data

gdp = read_excel("~/STAT 482/GDP_per_capita_total.xls", sheet = "Data", col_names = TRUE)

gdp = gdp[gdp$`Country Name` %in% countries$country_name,]
gdp = gdp[,!(colnames(gdp) %in% to_remove)]

Urbanization Rate Data

urbanization = read_excel("~/STAT 482/urbanization_rate_data.xls", sheet = "Data", col_names = TRUE)

urbanization = urbanization[urbanization$`Country Name` %in% countries$country_name,]
urbanization = urbanization[,!(colnames(urbanization) %in% to_remove)]

Combine Data into one Dataframe

# reorder the 'country' variable in order of most north to most south in latitude
countries_ordered = c("Eritrea", "Djibouti", "Chad", "Niger", "Mali", "Mauritania", "Sudan", "Ethiopia", "Somalia", "Central African Republic", "Cameroon", "Nigeria", "Benin", "Burkina Faso", "Ghana", "Togo", "Equatorial Guinea", "Gabon", "Congo, Rep.", "Congo, Dem. Rep.", "Uganda", "Kenya", "Rwanda", "Burundi", "Tanzania", "Angola", "Zambia", "Malawi", "Mozambique", "Zimbabwe", "Botswana", "Namibia", "South Africa", "Eswatini", "Madagascar", "Senegal", "Gambia, The", "Guinea-Bissau", "Guinea", "Sierra Leone", "Liberia", "Sao Tome and Principe")

# assign the ordered country names to malaria and forest dataset
malaria$`Country Name` = factor(malaria$`Country Name`, levels = countries_ordered)
forest$`Country Name` = factor(forest$`Country Name`, levels = countries_ordered)


# reshape datasets to long format
malaria_long = malaria %>%
  pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Malaria_Incidence")

forest_long = forest %>%
  pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Forest_Cover")

rainfall_long = rainfall %>%
  pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Rainfall_Depth")

gdp_long = gdp %>%
  pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "GDP_per_Capita")

urban_long = urbanization %>%
  pivot_longer(cols = starts_with("2"), names_to = "Year", values_to = "Urbanization_Perc")

# Merge datasets
combined_data = malaria_long %>%
  left_join(forest_long, by = c("Country Name", "Year")) %>%
  left_join(rainfall_long, by = c("Country Name", "Year")) %>%
  left_join(gdp_long, by = c("Country Name", "Year")) %>%
  left_join(urban_long, by = c("Country Name", "Year"))

combine_remove = as.character(c("Country Code.x", "Country Code.x.x", "Country Code.y.y", "Country Code.y"))
combined_data = combined_data[,!(colnames(combined_data) %in% combine_remove)]

Handle NAs

temp = combined_data

# Filter the dataset for non-missing GDP per capita values and for Eritrea
eritrea_data = temp[!is.na(temp$GDP_per_Capita) & temp$`Country Name` == "Eritrea", "GDP_per_Capita"]

# Calculate the average GDP per capita for Eritrea
eritreaGDPavg = sum(eritrea_data) / nrow(eritrea_data)

# Replace NA values with the calculated average
temp$GDP_per_Capita[is.na(temp$GDP_per_Capita) & temp$`Country Name` == "Eritrea"] = eritreaGDPavg


# Filter the dataset for non-missing Rainfall_Depth values and for Sudan
sudan_data = temp[!is.na(temp$Rainfall_Depth) & temp$`Country Name` == "Sudan", "Rainfall_Depth"]

# Calculate the average Rainfall_Depth for Sudan
sudanRainfallAvg = sum(sudan_data) / nrow(sudan_data)

# Replace NA values with the calculated average
temp$Rainfall_Depth[is.na(temp$Rainfall_Depth) & temp$`Country Name` == "Sudan"] = sudanRainfallAvg

combined_data = temp

Report 1: Exploratory Data Analysis

Scatter plots of Malaria Incidence and Forest Cover Percentage by Country

# make the year numeric for plotting
combined_data$Year = as.numeric(combined_data$Year)


# forest scatterplot of Year vs Forest Cover Percentage by Country
ggplot(data = combined_data, aes(x = Year, 
                                 y = Forest_Cover, 
                                 color = `Country Name`)) +
  geom_point(size = 3) +
  labs(title = "Year vs Forest Cover Percentage", 
       x = "Year", 
       y = "Forest Cover Percentage") +
  theme(
    legend.text = element_text(size = 10),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15), 
    plot.title = element_text(size = 20), 
    axis.text = element_text(size = 9)
  ) +
  guides(color = guide_legend(ncol = 1, title = NULL))

# malaria scatterplot of Year vs Malaria Incidence by Country
ggplot(data = combined_data, aes(x = Year, 
                                 y = Malaria_Incidence, 
                                 color = `Country Name`)) +
  geom_point(size = 3) +
  labs(title = "Year vs Malaria Incidence", 
       x = "Year", 
       y = "Malaria Incidence") +
  theme(
    legend.text = element_text(size = 10),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15), 
    plot.title = element_text(size = 20), 
    axis.text = element_text(size = 9)
  ) +
  guides(color = guide_legend(ncol = 1, title = NULL))

Box plots of Malaria Incidence and Forest Cover Percentage by Country (one plot)

# boxplot of malaria incidence by country - all in one plot
ggplot(data = combined_data, aes(x = `Country Name`, y = Malaria_Incidence)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.colour = "red", outlier.shape = 16) +
  labs(title = "Boxplot of Malaria Incidence by Country", x = "Country", y = "Malaria Incidence") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 45, hjust = 1))

# boxplot of forest cover percentage by country - all in one plot
ggplot(data = combined_data, aes(x = `Country Name`, y = Forest_Cover)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.colour = "red", outlier.shape = 16) +
  labs(title = "Boxplot of Forest Cover Percentage by Country", x = "Country", y = "Forest Cover Percentage") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text = element_text(size = 12),
    axis.text.x = element_text(angle = 45, hjust = 1))

Heat map of Malaria Incidence and Forest Cover Percentage by Country

# format forest cover data for heatmap compatability
forest_long = melt(combined_data, id.vars = c("Year", "Country Name"), 
                           measure.vars = "Forest_Cover")

ggplot(forest_long, aes(x = Year, y = `Country Name`, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "forest green") +
  labs(title = "Heatmap of Forest Cover Percentage by Year and Country", 
       x = "Year", 
       y = "Country", 
       fill = "Forest Cover Percentage") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text = element_text(size = 12))

# format malaria data for heatmap compatability
malaria_long = melt(combined_data, id.vars = c("Year", "Country Name"), 
                           measure.vars = "Malaria_Incidence")

ggplot(malaria_long, aes(x = Year, y = `Country Name`, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  labs(title = "Heatmap of Malaria Incidence by Year and Country", 
       x = "Year", 
       y = "Country", 
       fill = "Malaria Incidence") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 20),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    axis.text = element_text(size = 12))

Scatterplots by Country - Malaria and Forest Cover per Year

# Scatter plot for all countries and years

# Split the data by country
split_data = combined_data %>% split(.$`Country Name`)

# Create a list of plots for each country
plots_scatter = map2(split_data, names(split_data), ~ {
  ggplot(.x, aes(x = Year)) +
    geom_line(aes(y = `Forest_Cover`, color = "Forest Cover %"), linewidth = 1) +
    geom_line(aes(y = `Malaria_Incidence` * 100, color = "Malaria Incidence"), linewidth = 1) +  # Scale Malaria Incidence
    scale_y_continuous(
      name = "Forest Cover %",
      sec.axis = sec_axis(~ . / 100, name = "% of Population with Malaria")  # Adjust the secondary axis
    ) +
    labs(title = paste("Forest Cover % and % of Population with Malaria in", .y)) +
    theme(legend.position = "bottom") +
    scale_color_manual(
      name = "Legend",
      values = c("Forest Cover %" = "Forest Green", "Malaria Incidence" = "red")
    )
})

# Combine all plots into one display with free scales
combined_plot_scatter = wrap_plots(plots_scatter, ncol = 2)

# Display the combined plot
print(combined_plot_scatter)

Scatterplots by Country - All Covariates and Response

# Function to create side-by-side histograms for a single country
create_histograms = function(country_data, country_name) {
  malaria_plot = ggplot(country_data, aes(x = Year, y = `Malaria_Incidence`)) +
    geom_col(fill = "red", alpha = 0.7) +
    labs(title = paste("% of Population with Malaria in", country_name), x = "Year", y = "% of Population with Malaria") +
    theme_minimal()
  
  forest_plot = ggplot(country_data, aes(x = Year, y = `Forest_Cover`)) +
    geom_col(fill = "Forest Green", alpha = 0.7) +
    labs(title = paste("Forest Cover % in", country_name), x = "Year", y = "Forest Cover %") +
    theme_minimal()
  
  rainfall_plot = ggplot(country_data, aes(x = Year, y = `Rainfall_Depth`)) +
    geom_col(fill = "blue", alpha = 0.7) +
    labs(title = paste("Rainfall Depth in", country_name), x = "Year", y = "Rainfall Depth mm^3") +
    theme_minimal()
  
  gdp_plot = ggplot(country_data, aes(x = Year, y = `GDP_per_Capita`)) +
    geom_col(fill = "yellow", alpha = 0.7) +
    labs(title = paste("GDP per Capita in", country_name), x = "Year", y = "GDP per Capita, $US") +
    theme_minimal()
  
  urbanization_plot = ggplot(country_data, aes(x = Year, y = `Urbanization_Perc`)) +
    geom_col(fill = "orange", alpha = 0.7) +
    labs(title = paste("Urbanization Percentage in", country_name), x = "Year", y = "Urbanization %") +
    theme_minimal()
  
  malaria_plot + forest_plot + rainfall_plot + gdp_plot + urbanization_plot
}

# Create a list of plots for each country
plots = combined_data %>%
  split(.$`Country Name`) %>%
  map2(names(.), ~ create_histograms(.x, .y))

# Combine all plots into one display
combined_plot = wrap_plots(plots, ncol = 1)

# Display the combined plot
print(combined_plot)

Correlation Matrix

# Get only data columns and rename them for accurate data labels
corr_data = combined_data[,c("Malaria_Incidence", "Forest_Cover", "Rainfall_Depth", "Urbanization_Perc", "GDP_per_Capita")]
colnames(corr_data) = c("% Population with Malaria", "Forest Cover %", "Rainfall Depth mm^3", "Urbanization %", "GDP per Capita $US")

# Compute correlation matrix
cor_matrix = cor(corr_data)

# Create the correlation matrix plot
ggcorrplot(cor_matrix, 
           method = "circle", 
           type = "full", 
           lab = TRUE, 
           tl.cex = 8,
           colors = c("red", "white", "forest green"),
           title = "Correlation Matrix of Various Factors", ggtheme = theme_gray())


Report 2 - Linear Regression

Centralize Covariates

# Centralize the Covariate to have beta0 be the average malaria rate for an average country
covariates = c("Forest_Cover", "Urbanization_Perc", "Rainfall_Depth", "GDP_per_Capita")

# Centralize the covariates
for (covariate in covariates) {
  combined_data[[paste0(covariate)]] = combined_data[[covariate]] - mean(combined_data[[covariate]], na.rm = TRUE)
}

Fit Linear Regression Models for All Years

Includes model summaries and Normal QQ-Plots

# Fit a linear regression model including all year and country data
fit_overall = lm(Malaria_Incidence ~ Forest_Cover + Urbanization_Perc + Rainfall_Depth + GDP_per_Capita, data = combined_data)
qq_norm_res=qqnorm(residuals(fit_overall),main = "Normal Q-Q Plot - Overall")
qqnorm_model=lm(qq_norm_res$y~qq_norm_res$x)
abline(a=qqnorm_model$coef[1], b=qqnorm_model$coef[2])

summary(fit_overall)
## 
## Call:
## lm(formula = Malaria_Incidence ~ Forest_Cover + Urbanization_Perc + 
##     Rainfall_Depth + GDP_per_Capita, data = combined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33287 -0.10784  0.00007  0.10261  0.43663 
## 
## Coefficients:
##                       Estimate   Std. Error t value             Pr(>|t|)    
## (Intercept)        0.223655861  0.004662244  47.972 < 0.0000000000000002 ***
## Forest_Cover       0.001492906  0.000289713   5.153  0.00000031671127323 ***
## Urbanization_Perc -0.002757904  0.000344265  -8.011  0.00000000000000361 ***
## Rainfall_Depth     0.000062273  0.000009618   6.475  0.00000000015789517 ***
## GDP_per_Capita    -0.000011143  0.000002412  -4.620  0.00000440820008121 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1385 on 877 degrees of freedom
## Multiple R-squared:  0.2487, Adjusted R-squared:  0.2453 
## F-statistic: 72.58 on 4 and 877 DF,  p-value: < 0.00000000000000022
# Add Year (numerical) as a covariate in the overall model
fit_overall_w_year = lm(Malaria_Incidence ~ Year + Forest_Cover + Urbanization_Perc + Rainfall_Depth + GDP_per_Capita, data = combined_data)
qq_norm_res=qqnorm(residuals(fit_overall_w_year),main = "Normal Q-Q Plot - Overall with Year Covariate")
qqnorm_model=lm(qq_norm_res$y~qq_norm_res$x)
abline(a=qqnorm_model$coef[1], b=qqnorm_model$coef[2])

summary(fit_overall_w_year)
## 
## Call:
## lm(formula = Malaria_Incidence ~ Year + Forest_Cover + Urbanization_Perc + 
##     Rainfall_Depth + GDP_per_Capita, data = combined_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34834 -0.11077  0.00614  0.10182  0.46102 
## 
## Coefficients:
##                       Estimate   Std. Error t value          Pr(>|t|)    
## (Intercept)        6.880020154  1.583550594   4.345 0.000015580802513 ***
## Year              -0.003311624  0.000787833  -4.203 0.000028984521374 ***
## Forest_Cover       0.001294868  0.000290839   4.452 0.000009598182632 ***
## Urbanization_Perc -0.002535194  0.000345130  -7.346 0.000000000000469 ***
## Rainfall_Depth     0.000065568  0.000009560   6.859 0.000000000013117 ***
## GDP_per_Capita    -0.000010113  0.000002402  -4.211 0.000028106302089 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1372 on 876 degrees of freedom
## Multiple R-squared:  0.2636, Adjusted R-squared:  0.2594 
## F-statistic:  62.7 on 5 and 876 DF,  p-value: < 0.00000000000000022
# Print summary of coefficients and R^2
cat("\n", "Overall Model", "\n")
## 
##  Overall Model
print(fit_overall$coefficients)
##       (Intercept)      Forest_Cover Urbanization_Perc    Rainfall_Depth 
##     0.22365586117     0.00149290606    -0.00275790392     0.00006227250 
##    GDP_per_Capita 
##    -0.00001114299
cat("\n", "R^2:", summary(fit_overall)$r.squared, "\n", "R^2 adjusted:", summary(fit_overall)$adj.r.squared, "\n")
## 
##  R^2: 0.2487169 
##  R^2 adjusted: 0.2452903
cat("\n", "Overall Model with Year Covariate", "\n")
## 
##  Overall Model with Year Covariate
print(fit_overall_w_year$coefficients)
##       (Intercept)              Year      Forest_Cover Urbanization_Perc 
##     6.88002015386    -0.00331162403     0.00129486841    -0.00253519351 
##    Rainfall_Depth    GDP_per_Capita 
##     0.00006556804    -0.00001011266
cat("\n", "R^2:", summary(fit_overall_w_year)$r.squared, "\n", "R^2 adjusted:", summary(fit_overall_w_year)$adj.r.squared, "\n")
## 
##  R^2: 0.2635708 
##  R^2 adjusted: 0.2593675

Fit Linear Regression Model per Year

# Fit the model per year

# List to store models
models_list = list()
years = unique(combined_data$Year)

# Fit the model for each year and store in the list
for (year in years) {
  data_year = subset(combined_data, Year == year)
  model = lm(Malaria_Incidence ~ Forest_Cover + Urbanization_Perc + Rainfall_Depth + GDP_per_Capita, data = data_year)
  models_list[[as.character(year)]] = summary(model)
}

# Extract p-values, standard errors, estimates, and t-statistics
p_values = data.frame()
standard_errors = data.frame()
estimates = data.frame()
t_statistics = data.frame()

for (year in names(models_list)) {
  model_summary = models_list[[year]]
  coefs = coef(model_summary)
  
  p_values[year, names(coefs[, "Pr(>|t|)"])] = coefs[, "Pr(>|t|)"]
  standard_errors[year, names(coefs[, "Std. Error"])] = coefs[, "Std. Error"]
  estimates[year, names(coefs[, "Estimate"])] = coefs[, "Estimate"]
  t_statistics[year, names(coefs[, "t value"])] = coefs[, "t value"]
}

p_values$Year = c(2000:2020)
standard_errors$Year = c(2000:2020)
estimates$Year = c(2000:2020)
t_statistics$Year = c(2000:2020)

# Convert to long format for plotting
p_values_long = melt(p_values, id.vars = "Year", variable.name = "Variable", value.name = "P_Value")
standard_errors_long = melt(standard_errors, id.vars = "Year", variable.name = "Variable", value.name = "Standard_Error")
estimates_long = melt(estimates, id.vars = "Year", variable.name = "Variable", value.name = "Estimate")
t_statistics_long = melt(t_statistics, id.vars = "Year", variable.name = "Variable", value.name = "T_Value")
Plot Statistics from Models per Year
# Plot the statistics from models per year

# Define a color palette for the variables including the intercept
variable_colors = c(
  "(Intercept)" = "#9467bd",
  "Forest_Cover" = "#1f77b4",
  "Urbanization_Perc" = "#ff7f0e",
  "Rainfall_Depth" = "#2ca02c",
  "GDP_per_Capita" = "#d62728"
)

# Plotting p-values
ggplot(p_values_long, aes(x = Year, y = P_Value, color = Variable)) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = variable_colors) +
  labs(title = "P-Values Over Years", y = "P-Value")

# Plotting estimates for each variable
for (variable in unique(estimates_long$Variable)) {
  plot = ggplot(subset(estimates_long, Variable == variable), aes(x = Year, y = Estimate, color = Variable)) +
    geom_line() +
    geom_point() +
    scale_color_manual(values = variable_colors) +
    labs(title = paste("Estimates Over Years for", variable), y = "Estimate") +
    theme_minimal()
  
  print(plot)  # Display the plot
}

Goodness-of-Fit for Models per Year

# Perform Goodness-of-Fit and plot results

# List to store models
models_list = list()

# Unique years in the data
years = unique(combined_data$Year)

# Data frame to store goodness-of-fit metrics
goodness_of_fit = data.frame(Year = integer(), p_value = numeric(), Adjusted_R_squared = numeric())

# Fit the model for each year and store the goodness-of-fit metrics
for (year in years) {
  data_year = subset(combined_data, Year == year)
  
  # Fit the model
  model = lm(Malaria_Incidence ~ Forest_Cover + Urbanization_Perc + Rainfall_Depth + GDP_per_Capita, data = data_year)
  
  # Get the summary of the model
  model_summary = summary(model)
  
  # Perform the F-test
  f_statistic = model_summary$fstatistic["value"]
  numdf = model_summary$fstatistic["numdf"]
  dendf = model_summary$fstatistic["dendf"]
  p_value = pf(f_statistic, numdf, dendf, lower.tail = FALSE)
  
  # Extract Adjusted R-squared
  adjusted_r_squared = model_summary$adj.r.squared
  
  # Store the metrics in the data frame
  goodness_of_fit = rbind(goodness_of_fit, data.frame(Year = year, p_value = p_value, Adjusted_R_squared = adjusted_r_squared))
}

# Print the goodness-of-fit metrics
print(goodness_of_fit)
##         Year      p_value Adjusted_R_squared
## value   2000 0.0002616539         0.36988950
## value1  2001 0.0002222894         0.37578633
## value2  2002 0.0004723288         0.34798352
## value3  2003 0.0009651713         0.32027961
## value4  2004 0.0029776304         0.27367314
## value5  2005 0.0289883739         0.16611066
## value6  2006 0.0723187488         0.11616710
## value7  2007 0.1057996016         0.09379452
## value8  2008 0.1261010854         0.08308786
## value9  2009 0.0759991471         0.11330789
## value10 2010 0.0898775097         0.10351748
## value11 2011 0.1248323675         0.08371185
## value12 2012 0.1202155183         0.08602937
## value13 2013 0.1067189112         0.09327288
## value14 2014 0.1062453761         0.09354109
## value15 2015 0.0616321324         0.12526463
## value16 2016 0.0489940537         0.13803682
## value17 2017 0.0416193206         0.14692548
## value18 2018 0.0327940154         0.15964481
## value19 2019 0.0243111084         0.17520237
## value20 2020 0.0180159557         0.19034745
# Plot p-value over the years
ggplot(goodness_of_fit, aes(x = Year, y = p_value)) +
  geom_line() +
  geom_point() +
  labs(title = "P-Value Over Years", x = "Year", y = "P-Value") +
  theme_minimal()

# Plot Adjusted R-squared over the years
ggplot(goodness_of_fit, aes(x = Year, y = Adjusted_R_squared)) +
  geom_line() +
  geom_point() +
  labs(title = "Adjusted R-Squared Over Years", x = "Year", y = "Adjusted R-Squared") +
  theme_minimal()

Report 3: Mixed Effects Model

Model Selection

# DATA PREP
# Rename 'Country Name' to 'country_name'
mixed_data = combined_data %>%
  dplyr::rename(country_name = `Country Name`)

# Convert 'country_name' to a factor
mixed_data$country_name = as.factor(mixed_data$country_name)

# FIT MODELS
# only random forest cover
fit.nlme1 = lme(Malaria_Incidence ~ Rainfall_Depth + Urbanization_Perc + Forest_Cover, 
                 random = ~ Forest_Cover | country_name, 
                 data = mixed_data, 
                 control = lmeControl(opt = "optim", maxIter = 1000, msMaxIter = 1000))


# random forest cover and rainfall depth
fit.nlme2 = lme(Malaria_Incidence ~ Rainfall_Depth + Urbanization_Perc + Forest_Cover, 
                 random = ~ Rainfall_Depth + Forest_Cover | country_name, 
                 data = mixed_data, 
                 control = lmeControl(opt = "optim", maxIter = 1000, msMaxIter = 1000))


# remove rainfall depth from model, only forest cover is random 
fit.nlme3 = lme(Malaria_Incidence ~ Urbanization_Perc + Forest_Cover + Year, 
                 random = ~ Forest_Cover | country_name, 
                 data = mixed_data, 
                 control = lmeControl(opt = "optim", maxIter = 1000, msMaxIter = 1000))


# Compare models
anova(fit.nlme1, fit.nlme2, fit.nlme3)
##           Model df       AIC       BIC   logLik   Test  L.Ratio p-value
## fit.nlme1     1  8 -2457.784 -2419.563 1236.892                        
## fit.nlme2     2 11 -2458.491 -2405.937 1240.246 1 vs 2 6.707129  0.0818
## fit.nlme3     3  8 -2463.428 -2425.206 1239.714 2 vs 3 1.063598  0.7859
summary(fit.nlme1) # only forest cover is random
## Linear mixed-effects model fit by REML
##   Data: mixed_data 
##         AIC       BIC   logLik
##   -2457.784 -2419.563 1236.892
## 
## Random effects:
##  Formula: ~Forest_Cover | country_name
##  Structure: General positive-definite, Log-Cholesky parametrization
##              StdDev     Corr  
## (Intercept)  0.34598078 (Intr)
## Forest_Cover 0.01562161 0.583 
## Residual     0.04788059       
## 
## Fixed effects:  Malaria_Incidence ~ Rainfall_Depth + Urbanization_Perc + Forest_Cover 
##                         Value  Std.Error  DF   t-value p-value
## (Intercept)        0.20496958 0.06319732 837  3.243327  0.0012
## Rainfall_Depth    -0.00006071 0.00008565 837 -0.708788  0.4787
## Urbanization_Perc -0.00620376 0.00066010 837 -9.398216  0.0000
## Forest_Cover       0.01055866 0.00320217 837  3.297349  0.0010
##  Correlation: 
##                   (Intr) Rnfl_D Urbn_P
## Rainfall_Depth    -0.111              
## Urbanization_Perc  0.089 -0.370       
## Forest_Cover       0.582 -0.334  0.384
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.58766232 -0.49648480 -0.05907523  0.46003302 10.40481208 
## 
## Number of Observations: 882
## Number of Groups: 42
summary(fit.nlme2) # forest cover and rainfall depth are random
## Linear mixed-effects model fit by REML
##   Data: mixed_data 
##         AIC       BIC   logLik
##   -2458.491 -2405.937 1240.246
## 
## Random effects:
##  Formula: ~Rainfall_Depth + Forest_Cover | country_name
##  Structure: General positive-definite, Log-Cholesky parametrization
##                StdDev       Corr         
## (Intercept)    0.2922517064 (Intr) Rnfl_D
## Rainfall_Depth 0.0003672479 -0.417       
## Forest_Cover   0.0161291269  0.413 -0.997
## Residual       0.0479209915              
## 
## Fixed effects:  Malaria_Incidence ~ Rainfall_Depth + Urbanization_Perc + Forest_Cover 
##                         Value  Std.Error  DF   t-value p-value
## (Intercept)        0.24197768 0.05066426 837  4.776102  0.0000
## Rainfall_Depth    -0.00014014 0.00010088 837 -1.389165  0.1652
## Urbanization_Perc -0.00644862 0.00065258 837 -9.881674  0.0000
## Forest_Cover       0.01059189 0.00327504 837  3.234123  0.0013
##  Correlation: 
##                   (Intr) Rnfl_D Urbn_P
## Rainfall_Depth    -0.269              
## Urbanization_Perc  0.036 -0.290       
## Forest_Cover       0.415 -0.760  0.345
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.59814097 -0.49405252 -0.05963247  0.46838769 10.39570577 
## 
## Number of Observations: 882
## Number of Groups: 42
summary(fit.nlme3) # remove rainfall depth from model, only forest cover random
## Linear mixed-effects model fit by REML
##   Data: mixed_data 
##         AIC       BIC   logLik
##   -2463.428 -2425.206 1239.714
## 
## Random effects:
##  Formula: ~Forest_Cover | country_name
##  Structure: General positive-definite, Log-Cholesky parametrization
##              StdDev     Corr  
## (Intercept)  0.29737878 (Intr)
## Forest_Cover 0.01355455 0.605 
## Residual     0.04835353       
## 
## Fixed effects:  Malaria_Incidence ~ Urbanization_Perc + Forest_Cover + Year 
##                        Value Std.Error  DF   t-value p-value
## (Intercept)        2.2231455 1.1631920 837  1.911245  0.0563
## Urbanization_Perc -0.0052136 0.0009270 837 -5.624298  0.0000
## Forest_Cover       0.0082939 0.0026987 837  3.073301  0.0022
## Year              -0.0010048 0.0005785 837 -1.737041  0.0827
##  Correlation: 
##                   (Intr) Urbn_P Frst_C
## Urbanization_Perc  0.777              
## Forest_Cover      -0.157  0.032       
## Year              -0.999 -0.776  0.185
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -3.55810483 -0.48863561 -0.07330984  0.46453868 10.44061960 
## 
## Number of Observations: 882
## Number of Groups: 42

Model Visualizations

# Create visuals for the model with only Forest_Cover as the random effect

# Extract residuals from the model
residuals = residuals(fit.nlme3)

# Create QQ plot
qqnorm(residuals)
qqline(residuals, col = "red")

# Extract fixed effects coefficients and their confidence intervals
fixed_effects <- summary(fit.nlme3)$tTable
fixed_effects_df <- data.frame(
  Term = rownames(fixed_effects),
  Estimate = fixed_effects[, "Value"],
  Std_Error = fixed_effects[, "Std.Error"],
  t_value = fixed_effects[, "t-value"],
  P_Value = fixed_effects[, "p-value"]
)
fixed_effects_df$CI_Lower <- fixed_effects_df$Estimate - 1.96 * fixed_effects_df$Std_Error
fixed_effects_df$CI_Upper <- fixed_effects_df$Estimate + 1.96 * fixed_effects_df$Std_Error

# Remove the intercept from the data frame
fixed_effects_df <- fixed_effects_df[fixed_effects_df$Term != "(Intercept)", ]

# Plot fixed effects coefficients with their confidence intervals
ggplot(fixed_effects_df, aes(x = Term, y = Estimate)) +
  geom_point() +
  geom_errorbar(aes(ymin = CI_Lower, ymax = CI_Upper), width = 0.2) +
  labs(title = "Fixed Effects Coefficients with Confidence Intervals", x = "Estimate", y = "Value") +
  theme_minimal()

# Extract random effects for each country
random_effects <- ranef(fit.nlme3)
random_effects_df <- as.data.frame(random_effects)

# Rename columns to avoid issues with special characters
colnames(random_effects_df) <- c("Random_Intercept", "Random_Slope_Forest_Cover")

# Plot random effects for each country with non-overlapping text labels
ggplot(random_effects_df, aes(x = Random_Intercept, y = Random_Slope_Forest_Cover, label = rownames(random_effects_df))) +
  geom_point() +
  geom_text_repel(size = 2.5) +
  labs(title = "Random Effects for Each Country", x = "Random Intercept", y = "Random Slope for Forest Cover") +
  theme_minimal()

# Plot residuals vs fitted values
residuals <- resid(fit.nlme3)
fitted_values <- fitted(fit.nlme3)
residuals_df <- data.frame(Fitted_Values = fitted_values, Residuals = residuals)

ggplot(residuals_df, aes(x = Fitted_Values, y = Residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs Fitted Values", x = "Fitted Values", y = "Residuals") +
  theme_minimal()